package com.wolfword.crawler.client.deal;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.json.JSONArray;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.wolfword.common.util.object.StringUtil;
import com.wolfword.crawler.client.config.Config;
import com.wolfword.crawler.client.config.CrawlerConfig;
import com.wolfword.crawler.client.dto.CrawlTaskWithHtmlDto;
import com.wolfword.crawler.client.entity.AnalyzedInfoListEntity;
import com.wolfword.crawler.client.entity.CrawlTaskEntity;
import com.wolfword.crawler.client.utils.SpringContextUtil;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

/**
 * Crawler-client singleton ("StupidSpider" — evolved from Crawler.java).
 * Polls the server for uncrawled tasks, crawls each seed URL with crawler4j,
 * then uploads crawl results, discovered sub-pages and analysis info back to
 * the server. Designed to run forever on a dedicated thread.
 *
 * @author ServerZhang
 * @date 2018-01-09
 */
public class StupidSpider implements Runnable {

	private final Logger logger = LoggerFactory.getLogger(StupidSpider.class);

	private static Config config = SpringContextUtil.getBean(Config.class);
	private String crawlStorageFolder = config.getHtmlStorePath();

	// Crawl results for first-level pages only, including the page HTML.
	private List<CrawlTaskWithHtmlDto> crawledResult = new ArrayList<CrawlTaskWithHtmlDto>();
	// All links discovered inside first-level pages.
	private List<CrawlTaskEntity> crawledSubPages = new ArrayList<CrawlTaskEntity>();
	// Page analysis results.
	private List<AnalyzedInfoListEntity> analyzedInfo = new ArrayList<AnalyzedInfoListEntity>();

	// NOTE(review): kept public for backward compatibility with existing
	// callers, but this field should be private — use getInstance() instead.
	public static volatile StupidSpider instance = null;

	private StupidSpider() {
	}

	/** Double-checked-locking singleton accessor. */
	public static StupidSpider getInstance() {
		if (instance == null) {
			synchronized (StupidSpider.class) {
				if (instance == null) {
					instance = new StupidSpider();
				}
			}
		}
		return instance;
	}

	public List<CrawlTaskWithHtmlDto> getCrawledResult() {
		return crawledResult;
	}

	// The add* methods are called concurrently by the crawler4j worker
	// threads (see controller.start below), and ArrayList is not
	// thread-safe — hence synchronized. The getters are only used after
	// waitUntilFinish(), when the workers are done.
	public synchronized void addCrawledResult(CrawlTaskWithHtmlDto entity) {
		this.crawledResult.add(entity);
	}

	public List<CrawlTaskEntity> getCrawledSubPages() {
		return crawledSubPages;
	}

	public synchronized void addCrawledSubPages(CrawlTaskEntity entity) {
		this.crawledSubPages.add(entity);
	}

	public List<AnalyzedInfoListEntity> getAnalyzedInfo() {
		return analyzedInfo;
	}

	public synchronized void addAnalyzedInfo(AnalyzedInfoListEntity entity) {
		this.analyzedInfo.add(entity);
	}

	public synchronized void addAnalyzedInfo(List<AnalyzedInfoListEntity> entities) {
		this.analyzedInfo.addAll(entities);
	}

	/** Not implemented; kept for API compatibility with existing callers. */
	public void crawlHtmls() {

	}

	/**
	 * Main polling loop: fetch the pending task list, crawl every task,
	 * report the results to the server, then sleep 30 seconds and repeat.
	 * Exits only when the thread is interrupted.
	 */
	public void run() {
		while (true) {
			JSONArray jsonArray = ConnectServer.getUncrawled();

			// No tasks available — back off for 30 seconds and poll again.
			if (jsonArray == null || jsonArray.length() < 1) {
				logger.info("=============无任务，休息30秒");
				if (!sleepSeconds(30)) {
					return; // interrupted: stop the worker cleanly
				}
				continue;
			}

			int length = jsonArray.length();
			for (int i = 0; i < length; i++) {
				Object object = jsonArray.get(i);
				if (object == null) {
					logger.info("=============此条任务数据有误！");
					continue;
				}
				try {
					crawl(object);
				} catch (Exception e) {
					// One bad task must not kill the loop; log and move on.
					logger.error("=============此条任务数据爬取异常！", e);
					continue;
				}
				logger.info("完成比例：" + (i + 1) + "/" + length);
			}

			// All tasks finished — submit the crawl records to the server.
			StupidSpider singleSpider = getInstance();
			List<CrawlTaskWithHtmlDto> result = singleSpider.getCrawledResult();
			List<AnalyzedInfoListEntity> infoList = singleSpider.getAnalyzedInfo();
			ConnectServer.updateTaskFile(result);
			ConnectServer.insertAnalysisInfo(infoList);
			singleSpider.crawledResult = new ArrayList<CrawlTaskWithHtmlDto>(); // reset
			singleSpider.analyzedInfo = new ArrayList<AnalyzedInfoListEntity>(); // reset

			logger.info("=============休眠30秒");
			if (!sleepSeconds(30)) {
				return; // interrupted: stop the worker cleanly
			}
		}
	}

	/**
	 * Sleeps for the given number of seconds.
	 *
	 * @param seconds how long to sleep
	 * @return true if the sleep completed; false if the thread was
	 *         interrupted (the interrupt status is restored so the caller
	 *         can shut down)
	 */
	private boolean sleepSeconds(int seconds) {
		try {
			Thread.sleep(seconds * 1000L);
			return true;
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
			return false;
		}
	}

	/**
	 * Crawls one task: validates the task data, configures crawler4j, runs
	 * the crawl to completion and uploads the discovered sub-pages so the
	 * server can store them in its task table and save the files.
	 *
	 * @param entityObj JSON task object received from the server
	 * @throws Exception if the crawl controller cannot be created or started
	 * @author ServerZhang
	 * @date 2018-01-12
	 */
	private void crawl(Object entityObj) throws Exception {
		if (crawlStorageFolder == null) {
			crawlStorageFolder = "./htmls";
		}

		StupidSpider singleSpider = getInstance();
		singleSpider.crawledSubPages = new ArrayList<CrawlTaskEntity>(); // reset
		if (null == entityObj) {
			logger.info("=============此条任务数据有误！");
			return;
		}
		CrawlTaskEntity entity = json2Entity(entityObj.toString());
		if (null == entity) {
			logger.info("=============此条任务数据有误！");
			return;
		}

		Long id = entity.getId();
		String url = entity.getUrl();
		String containStr = entity.getContainStr();
		Long requestLogId = entity.getRequestLogId();
		Long rootUrlId = entity.getRootUrlId();
		if (StringUtil.isEmpty(id, rootUrlId)) {
			logger.info("=============此条任务数据有误！");
			return;
		}

		// The seed URL must be present.
		if (StringUtil.isEmpty(url)) {
			logger.info("要爬取的域名为空！");
			return;
		}
		// containStr controls which pages are kept; required.
		if (StringUtil.isEmpty(containStr)) {
			logger.info("containStr不能为空!");
			return;
		}
		// Make sure the HTML storage directory exists.
		File file = new File(crawlStorageFolder);
		if (!file.exists()) {
			file.mkdirs();
		}

		int numberOfCrawlers = 5; // default worker-thread count
		if (Config.crawlerThreadCount != null) {
			numberOfCrawlers = Config.crawlerThreadCount;
		}

		// Local name avoids shadowing the static "config" (Config) field.
		CrawlConfig crawlConfig = new CrawlConfig();
		// Where crawler4j keeps its working data.
		crawlConfig.setCrawlStorageFolder(crawlStorageFolder);
		// Depth 0: crawl only the seed page itself (seed=A is depth 0,
		// links found in A would be depth 1, and so on).
		crawlConfig.setMaxDepthOfCrawling(0);
		// -1: no limit on the number of pages fetched.
		crawlConfig.setMaxPagesToFetch(-1);
		// Delay between two requests to the same host, in ms (default 200).
		crawlConfig.setPolitenessDelay(200);
		// Connection timeout in milliseconds.
		crawlConfig.setConnectionTimeout(10 * 1000);

		PageFetcher pageFetcher = new PageFetcher(crawlConfig);
		RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
		RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);

		CrawlController controller = new CrawlController(crawlConfig, pageFetcher, robotstxtServer);
		// The URL the crawl starts from.
		controller.addSeed(url);

		// Context handed to every CrawlerConfig worker via getCustomData().
		Map<String, Object> map = new HashMap<String, Object>();
		map.put("containStr", containStr);
		map.put("storePath", crawlStorageFolder);
		map.put("suffix", ".html");
		map.put("id", id);
		map.put("parentId", entity.getParentId());
		map.put("requestLogId", requestLogId);
		map.put("firstUrl", url);
		map.put("rootUrlId", rootUrlId);
		controller.setCustomData(map);

		// Run the crawl and block until every worker thread has finished.
		controller.start(CrawlerConfig.class, numberOfCrawlers);
		controller.waitUntilFinish();
		logger.info("----------------------------->爬取已经执行完毕！");

		// Upload the sub-pages discovered during the crawl; the server
		// stores them and creates follow-up tasks.
		List<CrawlTaskEntity> subPages = singleSpider.getCrawledSubPages();
		if (subPages != null && !subPages.isEmpty()) {
			ConnectServer.uploadSubpage(subPages);
		}

		singleSpider.crawledSubPages = new ArrayList<CrawlTaskEntity>(); // reset
	}

	/**
	 * Converts a JSON object string into a {@link CrawlTaskEntity}.
	 * Entries whose value is 0 or JSON null are skipped; unknown keys are
	 * ignored.
	 *
	 * @param entityJson JSON object string, e.g. {"id":1,"url":"..."}
	 * @return the populated entity, or null when the string is not shaped
	 *         like a JSON object
	 * @author ServerZhang
	 * @date 2018-01-12
	 */
	private CrawlTaskEntity json2Entity(String entityJson) {
		// Cheap sanity check before handing the string to the JSON parser.
		if (!entityJson.startsWith("{") || !entityJson.endsWith("}")) {
			return null;
		}
		CrawlTaskEntity entity = new CrawlTaskEntity();

		JSONObject jsonObject = new JSONObject(entityJson);
		Iterator<String> keys = jsonObject.keys();
		while (keys.hasNext()) { // iterate over the object's keys
			String key = keys.next();
			if (key == null) {
				continue;
			}
			Object valueObj = jsonObject.get(key);
			// Skip zero and null values (JSONObject.NULL stringifies to "null").
			if (valueObj == null || valueObj.equals(0) || "null".equalsIgnoreCase(valueObj.toString())) {
				continue;
			}
			String value = valueObj.toString();
			switch (key) {
			case "id":
				entity.setId(Long.parseLong(value));
				break;
			case "parentId":
				entity.setParentId(Long.parseLong(value));
				break;
			case "url":
				entity.setUrl(value);
				break;
			case "urlShort":
				entity.setUrlShort(value);
				break;
			case "containStr":
				entity.setContainStr(value);
				break;
			case "rootUrlId":
				entity.setRootUrlId(Long.parseLong(value));
				break;
			case "storePath":
				entity.setStorePath(value);
				break;
			default:
				break;
			}
		}
		return entity;
	}
}
